pip install usaddress
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
import plotly.express as px
import matplotlib.ticker as mtick
from plotly import graph_objects as go
import re
import usaddress
# exploring the whole dataset to get general information
# Load the raw establishments table; every later step reads from `rest_data`.
rest_data = pd.read_csv('/datasets/rest_data_us.csv')
# Quick look at the data: a random row, the (rows, columns) shape, and the
# last and first rows.
# NOTE(review): these bare expressions only display something when they are the
# last statement of a notebook cell — cell boundaries are not visible here.
rest_data.sample()
rest_data.shape
rest_data.tail()
rest_data.head()
# Column dtypes and non-null counts; 'deep' also measures the real memory
# used by object (string) columns.
rest_data.info(memory_usage='deep')
print(rest_data.duplicated().sum()) # number of fully duplicated rows
# Summary statistics of the numeric columns.
rest_data.describe()
# exploring columns individually to look for unique/missing/duplicated/abnormal values
#--id--#
# Number of 'id' values that repeat an earlier one (0 means every id is unique).
print(rest_data['id'].duplicated().sum())
The 'id' column has no duplicated values.
#--object_name--#
# How many names repeat an earlier one, and how often each name occurs.
print(rest_data['object_name'].duplicated().sum())
rest_data['object_name'].value_counts()
#--address--#
# Same checks for the address column.
print(rest_data['address'].duplicated().sum())
rest_data['address'].value_counts()
#--chain--#
# Frequency of each chain flag (value_counts leaves out missing values).
print(rest_data['chain'].value_counts())
# Number of missing values
rest_data['chain'].isnull().sum()
#--object_type--#
# Frequency of each establishment type.
rest_data['object_type'].value_counts()
#--number--#
# Frequency of each value in 'number' (used below as the number of seats).
rest_data['number'].value_counts()
After exploring the dataset, these are my conclusions:
General Information:
Missing Values/Abnormal Values/Abnormal Patterns:
Duplicated Rows:
Using the duplicated() and sum() methods, I found that there are no duplicated rows in the dataset.
Popular Values:
# Plotting a graph of object_type
# number of establishments of each type
type_data = rest_data['object_type'].value_counts()
# turn the counts into exact shares of the total, so the percent axis does not
# depend on a hard-coded approximation of the number of rows
type_share = type_data / type_data.sum()
# setting the style
sns.set_style('whitegrid')
# setting the size
fig, ax = plt.subplots(figsize=(15, 6))
# plotting the graph on the axes created above
type_share.plot(kind='bar', ax=ax)
ax.set_title("The Proportions Of The Various Types Of Establishments In LA", fontsize=16, fontweight="bold")
ax.set_xlabel('ESTABLISHMENT TYPE', fontsize=16)
ax.set_ylabel('PROPORTION', fontsize=16)
# the values are fractions of 1, so 1.0 is the 100% mark
ax.yaxis.set_major_formatter(mtick.PercentFormatter(1))
# no legend: there is a single series and the x tick labels already name each bar
plt.show()
# count establishments per chain flag and keep the result as {flag: count}
chain_counts = rest_data['chain'].value_counts()
chain_data = chain_counts.to_dict()
# one figure with a single axes for the pie chart
fig, ax = plt.subplots(figsize=(20, 9))
slice_labels = chain_data.keys()
slice_sizes = chain_data.values()
label_style = {'fontsize' : 13, 'color': 'black'}
# each slice is annotated with its share, to one decimal place
ax.pie(slice_sizes, labels=slice_labels, autopct='%1.1f%%', textprops=label_style)
ax.set_title('The Proportions Of Chain And Nonchain Establishments')
# legend placed to the right of the pie
ax.legend(
    title="TRUE/FALSE",
    fontsize=15,
    loc='upper right',
    bbox_to_anchor=(1, 0, 0.5, 1),
);
# Bar chart: mean of the chain flag (i.e. share of chains) per establishment type
sns.set_style("whitegrid")
fig, ax = plt.subplots(figsize=(15, 6))
sns.barplot(data=rest_data, x="object_type", y="chain", ax=ax)
ax.set_title("Establishments That Are Typically A Chain In LA", fontsize=15, fontweight="bold")
ax.set_xlabel('ESTABLISHMENT TYPE', fontsize=12)
ax.set_ylabel('CHAIN (%)', fontsize=12)
# show the y axis as a percentage (values are fractions of 1)
ax.yaxis.set_major_formatter(mtick.PercentFormatter(1))
# reference line: above it, most establishments of that type are chains
ax.axhline(y=0.5, color='black', linestyle='--', label='50% mark')
ax.legend()
plt.show()
# Finding the median value of number of seats
seats = rest_data['number']
median_seats = seats.median()
sns.set_style("whitegrid")
fig, ax = plt.subplots(figsize=(15, 6))
# Histogram of shares: weighting every row by 1/N makes each bar the fraction
# of establishments in that bin, so the percent axis shows real percentages.
# (The previous density curve was rescaled with an arbitrary formatter factor
# and labelled as a count.)
ax.hist(seats, bins=50, weights=np.ones(len(seats)) / len(seats))
ax.set_title("Distribution Of Seats In All Establishments", fontsize=15, fontweight="bold")
ax.set_xlabel('NUMBER OF SEATS', fontsize=12)
ax.set_ylabel('ESTABLISHMENTS (%)', fontsize=12)
ax.set(xlim=(0, 300))
# bar heights are fractions of 1, so 1.0 is the 100% mark
ax.yaxis.set_major_formatter(mtick.PercentFormatter(1))
# Marking median seats
ax.axvline(x=median_seats, color='green', linestyle='--', label='median seats mark')
ax.legend()
plt.show()
# Keep only chain establishments, then count how many of them have each
# number of seats (one row per distinct seat count, count stored in 'id')
chain_data = (
    rest_data.query('chain== True')
    .groupby('number')['id']
    .count()
    .reset_index()
)
# Creating a function that separates the data into the two required groups
def seat_group(data, threshold=50):
    """Label a row as having 'many' or 'few' seats.

    Args:
        data: a mapping-like row (for example a DataFrame row passed by
            ``DataFrame.apply(..., axis=1)``) with a 'number' entry.
        threshold: seat count above which the row counts as 'many'.
            Defaults to 50, the cut-off used throughout this analysis.

    Returns:
        'many' if ``data['number']`` is strictly greater than ``threshold``,
        otherwise 'few'.
    """
    return 'many' if data['number'] > threshold else 'few'
# label every seat count of the chain table as 'few' or 'many'
chain_data['seat_group'] = chain_data.apply(seat_group, axis=1)
chain_data.tail()  # quick check of the new column
# bar chart: how many chain establishments have each number of seats,
# coloured by seat group
axis_labels = {'number':'Number Of Seats', 'id':'COUNT (ESTABLISHMENTS)'}
fig = px.bar(
    chain_data,
    x='number',
    y='id',
    color='seat_group',
    title='Distribution Of Seats In Chain Establishments',
    labels=axis_labels,
)
fig.update_layout(width=800, height=500)
fig.show()
# Finding the average number of seats for each type of restaurant
avg_seats = rest_data.groupby('object_type')['number'].mean().reset_index()
# keep the sorted table (the sorted result used to be computed and discarded)
avg_seats = avg_seats.sort_values('number', ascending=False)
avg_seats
# Applying the seat_group function on the rest_data
rest_data['seat_group'] = rest_data.apply(seat_group, axis=1)
# Share of 'few' / 'many' seat establishments among restaurants and bars
print('Precentage of few and many average seats in restaurant and bar establishments')
print(rest_data.query("object_type in ['Restaurant','Bar']")['seat_group'].value_counts(
    normalize=True) * 100)
print('')
# Share of 'few' / 'many' seat establishments among all other types.
# Printed explicitly like the first table, so it always appears under its
# header instead of only when it is the last expression of a notebook cell.
print('Precentage of few and many average seats in other establishments')
print(rest_data.query("object_type not in ['Restaurant','Bar']")['seat_group'].value_counts(
    normalize=True) * 100)
# Bar chart of the mean number of seats per establishment type
sns.set_style("whitegrid")
fig, ax = plt.subplots(figsize=(15, 6))
ax = sns.barplot(data=rest_data, x="object_type", y="number", ax=ax)
ax.set_title(
    "The Average Number Of Seats For Each Type Of Establishment",
    fontsize=15,
    fontweight="bold",
)
ax.set_xlabel('ESTABLISHMENT TYPE', fontsize=12)
ax.set_ylabel('AVG NUMBER OF SEATS', fontsize=12)
plt.show()
# Same chart as above, with each type split into chain / non-chain bars
sns.set_style("whitegrid")
fig, ax = plt.subplots(figsize=(15, 6))
ax = sns.barplot(data=rest_data, x="object_type", y="number", hue="chain", ax=ax)
ax.set_title(
    "The Average Number Of Seats For Each Type Of Establishment \n Chain And Non-chain Establishments",
    fontsize=15,
    fontweight="bold",
)
ax.set_xlabel('ESTABLISHMENT TYPE', fontsize=12)
ax.set_ylabel('AVG NUMBER OF SEATS', fontsize=12)
plt.show()
# exploring the non-chain bakery establishments due to the result on the graph
# The rest of the analysis treats 'chain' as a boolean flag (e.g. the
# 'chain== True' query), so compare against the boolean False: comparing
# against the string 'False' never matches and always selects zero rows.
# `== False` (rather than `~`) keeps rows with a missing flag out of the result.
rest_data.loc[(rest_data['chain'] == False) & (rest_data['object_type'] == 'Bakery')].count()
def street(raw):
    """Return the street name contained in a raw address string.

    A handful of addresses are special-cased up front and mapped to a
    hand-picked street name; every other address is tagged with
    ``usaddress`` and its 'StreetName' component is returned.

    The special cases return only the street name (no house number,
    direction or street type), the same form the tagged branch produces, so
    those rows are counted together with the other establishments on the
    same street.

    Args:
        raw: the address as a string.

    Returns:
        The street name as a string.

    Raises:
        KeyError: if the tagged address has no 'StreetName' component.
        Any error raised by ``usaddress.tag`` itself is propagated unchanged.
    """
    # Hand-mapped addresses (matched on a distinctive suffix or prefix).
    if raw.endswith('3RDFL'):
        return '1ST'
    if raw.endswith('LOB#2'):
        return '9TH'
    if raw.endswith('PLAZA 35'):
        return 'GRAND'
    # both spellings of the same street share one branch
    if raw.startswith(('123 ASTRONAUT', '123 E ASTRONAUT')):
        return 'ASTRONAUT E S ONIZUKA'
    if raw.startswith('OLVERA'):
        return 'OLVERA'
    if raw.startswith('1033 1/2 LOS ANGELES'):
        return 'LOS ANGELES'

    # usaddress.tag returns (tagged components, address type); only the
    # tagged components are needed here.
    address = usaddress.tag(raw)
    return str(address[0]['StreetName'])
# derive a street name for every address and keep it in a new column
rest_data['street_name'] = rest_data['address'].apply(street)
rest_data
# establishments per street, most frequent street first
streets_esta = rest_data['street_name'].value_counts()
# the ten streets with the most establishments
top10_esta = streets_esta.iloc[:10]
top10_esta
# setting the style
sns.set_style('whitegrid')
# setting the size
fig, ax = plt.subplots(figsize=(15, 6))
# plotting the graph on the axes created above
top10_esta.plot(kind='bar', ax=ax)
ax.set_title("The Top Ten Streets By Number Of All Establishments", fontsize=16, fontweight="bold")
ax.set_xlabel('STREET NAME', fontsize=16)
# the counts cover every establishment type, not only restaurants
ax.set_ylabel('NUMBER OF ESTABLISHMENTS', fontsize=16)
plt.show()
# keep only the rows whose establishment type is Cafe
is_cafe = rest_data['object_type'] == "Cafe"
cafes = rest_data[is_cafe]
# cafes per street, most frequent street first
streets_cafe = cafes['street_name'].value_counts()
# the ten streets with the most cafes
top10_cafe = streets_cafe.iloc[:10]
top10_cafe
# setting the style
sns.set_style('whitegrid')
# setting the size
fig, ax = plt.subplots(figsize=(15, 6))
# plotting the graph on the axes created above
top10_cafe.plot(kind='bar', ax=ax)
ax.set_title("The Top Ten Streets By Number Of Cafe Establishments", fontsize=16, fontweight="bold")
ax.set_xlabel('STREET NAME', fontsize=16)
# the counts are cafes only, so label the axis accordingly
ax.set_ylabel('NUMBER OF CAFES', fontsize=16)
plt.show()
# Each entry of the per-street counts is one street, so counting the entries
# gives the number of streets.
all_esta_streets = streets_esta.count()
all_cafe_streets = streets_cafe.count()
# Streets whose count is exactly one
single_esta_mask = streets_esta == 1
single_cafe_mask = streets_cafe == 1
one_esta_street = streets_esta[single_esta_mask].count()
one_cafe_street = streets_cafe[single_cafe_mask].count()
# Share of single-establishment streets among all streets, in percent
ratio_esta = one_esta_street/all_esta_streets * 100
ratio_cafe = one_cafe_street/all_cafe_streets * 100

# Report (the percentages are truncated to whole numbers)
print("The number of all streets with establishments:", all_esta_streets)
print("The number of all streets with cafes:", all_cafe_streets)
print("The number of streets that have only one establishment:", one_esta_street)
print("The number of streets that have only one cafe:", one_cafe_street)
print("The ratio between one establishment streets and all streets with establishments:", int(ratio_esta), '%')
print("The ratio between one cafe streets and all streets with cafes:", int(ratio_cafe), '%')
# plotting a distribution graph for all establishments
sns.set_style("whitegrid")
fig, ax = plt.subplots(figsize=(15, 6))
# Histogram of shares: weighting every street by 1/N makes each bar the
# fraction of streets in that bin, so the percent axis shows real percentages
# (the previous density curve was rescaled with an arbitrary formatter factor).
ax.hist(streets_esta, bins=50, weights=np.ones(len(streets_esta)) / len(streets_esta))
ax.set_title("Distribution Of Number Of Establishments In A Street", fontsize=15, fontweight="bold")
ax.set_xlabel('NUMBER OF ESTABLISHMENTS', fontsize=12)
ax.set_ylabel('STREETS (%)', fontsize=12)
ax.set(xlim=(0, 404))
# bar heights are fractions of 1, so 1.0 is the 100% mark
ax.yaxis.set_major_formatter(mtick.PercentFormatter(1))
plt.show()
# plotting a distribution graph for cafe establishments
sns.set_style("whitegrid")
fig, ax = plt.subplots(figsize=(15, 6))
# Histogram of shares: weighting every street by 1/N makes each bar the
# fraction of streets in that bin, so the percent axis shows real percentages
# rather than a density value formatted as a percentage.
ax.hist(streets_cafe, bins=30, weights=np.ones(len(streets_cafe)) / len(streets_cafe))
ax.set_title("Distribution Of Number Of Cafe Establishments In A Street", fontsize=15, fontweight="bold")
ax.set_xlabel('NUMBER OF ESTABLISHMENTS', fontsize=12)
ax.set_ylabel('STREETS (%)', fontsize=12)
ax.set(xlim=(0, 30))
# bar heights are fractions of 1, so 1.0 is the 100% mark
ax.yaxis.set_major_formatter(mtick.PercentFormatter(1))
plt.show()
# Creating a dataset of streets with many establishments
# The street names are the index of the per-street counts. Reading them from
# the index directly avoids relying on the column name reset_index() produces,
# which is 'index' only when the index is unnamed (newer pandas names it).
many_esta = pd.Series(streets_esta[streets_esta > 50].index)
many_esta
# Slicing the data according to the street names that appear on the dataset above
many_esta_street = rest_data.query('street_name in @many_esta')
# number of establishments for each seat count (count stored in 'id')
many_esta_seat = many_esta_street.groupby('number')['id'].agg('count').reset_index()
many_esta_seat.head()
# plotting a distribution graph
fig = px.bar(many_esta_seat, x='number', y='id', title='Distribution Of Seats on Establishments That Located In Popular Streets',labels={'number':'Number Of Seats', 'id':'COUNT (ESTABLISHMENTS)'})
fig.update_layout(width=800,height=500)
fig.show()
# Creating a dataset of streets with many cafe establishments
# The street names are the index of the per-street counts. Reading them from
# the index directly avoids relying on the column name reset_index() produces,
# which is 'index' only when the index is unnamed (newer pandas names it).
many_cafe = pd.Series(streets_cafe[streets_cafe > 7].index)
many_cafe
# Slicing the data according to the street names that appear on the dataset above
many_cafe_street = rest_data.query('street_name in @many_cafe and object_type == "Cafe"')
# number of cafes for each seat count (count stored in 'id')
many_cafe_seat = many_cafe_street.groupby('number')['id'].agg('count').reset_index()
many_cafe_seat.head()
# plotting a distribution graph
fig = px.bar(many_cafe_seat, x='number', y='id', title='Distribution Of Seats on Cafe Establishments That Located In Popular Streets',labels={'number':'Number Of Seats', 'id':'COUNT (ESTABLISHMENTS)'})
fig.update_layout(width=800,height=500)
fig.show()
# plotting the seat distribution of all establishments on popular streets
sns.set_style("whitegrid")
fig, ax = plt.subplots(figsize=(20, 6))
# violinplot is the axes-level function, so it draws straight onto `ax`.
# The figure-level catplot builds its own figure, which had to be closed by
# its assumed figure number afterwards.
# NOTE(review): catplot's height= and edgecolor= options are not carried
# over; re-add edgecolor if the installed seaborn's violinplot honors it.
sns.violinplot(x="street_name", y="number", data=many_esta_street, ax=ax)
# set limits and tick rotation after plotting so the plot call cannot reset them
ax.set_ylim(0, 270)
ax.tick_params(axis='x', labelrotation=45)
ax.set_title("Distribution Of Number Of Seats For Establishments On Popular Streets", fontsize=15, fontweight="bold")
ax.set_xlabel('STREET NAME', fontsize=12)
ax.set_ylabel('NUMBER OF SEATS', fontsize=12)
plt.show()
# plotting the seat distribution of cafe establishments on popular streets
sns.set_style("whitegrid")
fig, ax = plt.subplots(figsize=(20, 6))
# violinplot is the axes-level function, so it draws straight onto `ax`.
# The figure-level catplot builds its own figure, which had to be closed by
# its assumed figure number afterwards.
# NOTE(review): catplot's height= and edgecolor= options are not carried
# over; re-add edgecolor if the installed seaborn's violinplot honors it.
sns.violinplot(x="street_name", y="number", data=many_cafe_street, ax=ax)
# set limits and tick rotation after plotting so the plot call cannot reset them
ax.set_ylim(0, 270)
ax.tick_params(axis='x', labelrotation=45)
ax.set_title("Distribution Of Number Of Seats For Cafe Establishments On Popular Streets", fontsize=15, fontweight="bold")
ax.set_xlabel('STREET NAME', fontsize=12)
ax.set_ylabel('NUMBER OF SEATS', fontsize=12)
plt.show()
After determining what is considered a popular street (one with many establishments) and creating graphs of the number of seats in the establishments on those streets, here are my conclusions:
In conclusion, the trend is that establishments located on popular streets have a similar number of seats: no more than 50, and typically around 30.
Recommendation:
If you're considering opening a cafe on a popular street, keep in mind that in all of these streets the number of seats is low.
In conclusion, my recommendation, based on the data I was given is to not invest in opening a robot-run cafe establishment, due to the lack of popularity, the small number of seats on average, and the chances of it becoming a chain establishment.